import sys
import bz2
sys.path.append ("..")
import PYUtil

# Module-wide memo used by annotate (): maps a sub-phrase to the list of
# (pinyin, ok) pairs annotate () produced for it.  Entries are only ever
# added in this file, never removed, and the key is the phrase alone (the
# dictionaries passed to annotate () are not part of the key).
phrase_cache = {}

def load_pinyin_table ():
	"""Load the pinyin table with each entry's readings sorted by frequency.

	Reads ``../../../../data/pinyin_table.txt`` through
	``PYUtil.load_pinyin_table`` and turns every ``{pinyin: freq}`` mapping
	it returns into a list of ``(pinyin, freq)`` tuples ordered by
	descending ``freq``.

	Returns:
		dict mapping each key of the loaded table to its sorted list of
		``(pinyin, freq)`` tuples.
	"""
	# Close the data file deterministically instead of leaking the handle.
	# NOTE(review): assumes PYUtil.load_pinyin_table has finished reading
	# the file by the time it returns -- confirm against PYUtil.
	with open ("../../../../data/pinyin_table.txt") as table_file:
		hanzi_dict = PYUtil.load_pinyin_table (table_file)

	tmp = {}
	for key, value in hanzi_dict.items ():
		# sorted () is stable just like list.sort (), so readings with equal
		# frequency keep the order in which value.items () produced them.
		tmp[key] = sorted (value.items (), key = lambda v: v[1], reverse = True)

	return tmp

def _add_phrase_lines (lines, table):
	"""Parse ``phrase pinyin`` lines and append each pinyin to table[phrase]."""
	for line in lines:
		line = unicode (line, "utf8").strip ()
		# Each line must hold exactly two whitespace-separated fields;
		# anything else raises ValueError here.
		phrase, pinyin = line.split ()
		table.setdefault (phrase, []).append (pinyin)

def load_duoyin_phrases ():
	"""Load the phrase -> pinyin readings dictionary.

	Reads ``duoyin_phrase.txt.bz2`` first and ``duoyin_phrase_manual.txt``
	second, both UTF-8 encoded with one ``phrase pinyin`` pair per line.
	A phrase that occurs on several lines (in either file) collects all of
	its pinyin strings in the order they were read.

	Returns:
		dict mapping a unicode phrase to a list of unicode pinyin strings.

	Raises:
		ValueError: if a line does not consist of exactly two fields.
	"""
	tmp = {}

	# Alternative data file (disabled): "qq_pinyin_1.0.1.txt.bz2"
	# Both inputs are closed as soon as they have been consumed.
	with bz2.BZ2File ("duoyin_phrase.txt.bz2", "r") as packed:
		_add_phrase_lines (packed, tmp)

	with open ("duoyin_phrase_manual.txt") as manual:
		_add_phrase_lines (manual, tmp)

	return tmp

def annotate_by_hanzi (phrase, hanzi_dict):
	"""Yield every character-by-character pinyin spelling of ``phrase``.

	``hanzi_dict[ch]`` is a sequence whose items carry the pinyin at index
	0.  The readings of consecutive characters are joined with ``'``; the
	reading of the first character varies slowest and that of the last
	character varies fastest.

	An empty ``phrase`` raises IndexError and a character missing from
	``hanzi_dict`` raises KeyError once the generator is iterated.
	"""
	head, tail = phrase[0], phrase[1:]
	readings = [entry[0] for entry in hanzi_dict[head]]

	# Single character: its own readings are the complete answer.
	if not tail:
		for reading in readings:
			yield reading
		return

	# Otherwise prefix every spelling of the remainder with each reading.
	for reading in readings:
		for rest in annotate_by_hanzi (tail, hanzi_dict):
			yield u"'".join ((reading, rest))

def annotate (phrase, hanzi_dict, phrase_dict):
	"""Yield ``(pinyin, ok)`` pairs for ``phrase``.

	``ok`` is True when the reading is considered unambiguous and False
	otherwise.  Sources are tried in this order:

	1. ``phrase_dict``: every listed reading is yielded with ok=True.
	2. ``phrase_cache``: the stored pairs are replayed unchanged.
	3. Character-by-character readings from ``annotate_by_hanzi``:
	   - exactly one reading: yielded with ok=True;
	   - phrase of at most two characters: all readings, ok=False;
	   - longer phrase: it is cut into a prefix and a suffix at every
	     position, longest prefix first, and both parts are annotated
	     recursively (their results are stored in ``phrase_cache``).
	     The first cut where both parts have a single reading gives the
	     answer with ok=True.  Failing that, the cut with the fewest
	     combinations (strictly fewer than the character-by-character
	     count) is expanded, each pair being ok only if both halves are;
	     if no cut beats that count, all character-by-character readings
	     are yielded with ok=False.
	"""
	# 1. Explicit phrase dictionary.
	if phrase in phrase_dict:
		for reading in phrase_dict[phrase]:
			yield reading, True
		return

	# 2. Result remembered from an earlier split.
	if phrase in phrase_cache:
		for reading, ok in phrase_cache[phrase]:
			yield reading, ok
		return

	# 3. Fall back to combining the readings of the single characters.
	candidates = list (annotate_by_hanzi (phrase, hanzi_dict))

	if len (candidates) == 1:
		yield candidates[0], True
		return

	if len (phrase) <= 2:
		for reading in candidates:
			yield reading, False
		return

	best_left = None
	best_right = None
	best_size = len (candidates)
	for cut in range (len (phrase) - 1, 0, -1):
		left_part = phrase[:cut]
		right_part = phrase[cut:]

		# The prefix is annotated and cached before the suffix is looked at.
		left = list (annotate (left_part, hanzi_dict, phrase_dict))
		phrase_cache.setdefault (left_part, left)
		right = list (annotate (right_part, hanzi_dict, phrase_dict))
		phrase_cache.setdefault (right_part, right)

		# Both halves have a single reading: accept this cut immediately.
		if len (left) == 1 and len (right) == 1:
			yield u"'".join ([left[0][0], right[0][0]]), True
			return

		size = len (left) * len (right)
		if size < best_size:
			best_size = size
			best_left = left
			best_right = right

	if best_left is None and best_right is None:
		# No cut produced fewer combinations than the per-character readings.
		for reading in candidates:
			yield reading, False
		return

	for left_reading, left_ok in best_left:
		for right_reading, right_ok in best_right:
			yield u"'".join ([left_reading, right_reading]), left_ok and right_ok

def main ():
	"""Annotate ``phrase freq`` lines from stdin with pinyin.

	Each UTF-8 input line holds a phrase and an integer frequency.  For
	every reading produced by annotate () a ``phrase<TAB>pinyin<TAB>freq``
	line is written: to stdout when the reading is flagged ok, to stderr
	otherwise.  If annotating a phrase fails, an
	``ERROR <lineno>: phrase<TAB>freq`` line (0-based line number) goes to
	stderr and processing continues with the next input line.
	"""
	hanzi_dict = load_pinyin_table ()
	phrase_dict = load_duoyin_phrases ()
	for lineno, line in enumerate (sys.stdin):
		line = unicode (line, "utf8").strip ()
		phrase, freq = line.split ()
		freq = int (freq)
		try:
			pinyins = list (annotate (phrase, hanzi_dict, phrase_dict))
		except Exception:
			# Catch Exception rather than everything so that Ctrl-C and
			# sys.exit () still terminate the run.
			output = u"ERROR %d: %s\t%d" % (lineno, phrase, freq)
			print >> sys.stderr, output.encode ("utf8")
			# Skip the output loop: ``pinyins`` would otherwise still hold
			# the previous line's readings (or be unbound on the first line).
			continue

		for p, ok in pinyins:
			output = u"%s\t%s\t%d" % (phrase, p, freq)
			if ok:
				print >> sys.stdout, output.encode ("utf8")
			else:
				print >> sys.stderr, output.encode ("utf8")


if __name__ == "__main__":
	main ()
